# Introduction #########
#
#
# The following is an analysis of one-hundred styles of beer brewed in the United States for the executive team,
# CEO and CFO at Budweiser. Budweiser is interested in exploring the how many breweries are in the United States, 
# how each beer is reported in terms of its International Bitterness Unit and Alcohol By Content and basic summary
# statistics and conclusions we are able to uncover with the beer data provided. Statistics will include handling
# missing data and explaining why it was possibly not included in the initial dataset, as well as uncover median 
# and maximum (IBU and ABV) ratings by state. Conclusions will include basic summary statics on the ABV variable, 
# any relationship between the IBU and ABV variables (such as dependencies, e.g. does a higher IBU result in a 
# higher ABV) and finally we will look to see if we can determine general beer styles (Ales and IPAs) based on 
# ABV and IBU values. Additionally, we will report on any findings that are discovered during the analysis.
#
######################
#                    #
#     Libraries      #
#                    #
######################
######################
library(usmap)
library(ggplot2)
library(magrittr)
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(readr)
library(tibble)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ purrr   0.3.4     ✓ forcats 0.5.0
## ✓ dplyr   1.0.2
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::extract()   masks magrittr::extract()
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(robustbase)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
library(class)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)
library(dplyr)
library(RColorBrewer)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
######################
######################
#                    #
#        Data        #
#                    #
######################
########################################################
#read in brewery data
breweryDat <- read.csv("breweries.csv")
breweryDat$State <- trimws(breweryDat$State)
#datafile to organize states into census regions
regionData <- read.csv("stategeocodes.csv")
regionData <- regionData[,c(-1,-2)]
regionData <- dplyr::rename(regionData, "FIPS"="State..FIPS.", "Region" = "Region.1", "Division" = "Division.1")
regionData$State <- trimws(regionData$State)
#Ensure structure of data is compliant 
#head(breweryDat)
#read in beer data
beerDat <- read.csv("beers.csv")
#Loop to fix leading decimal places on ABV
i <- 1
count <- length(beerDat$Name)
for (i in 1:count) {
 if(is.na(beerDat[i,3])){
  beerDat[i,3]=0
   }
   if(beerDat[i,3]<1){
    beerDat[i,3] <- beerDat[i,3]*100
  }
}
#Ensure structure of data is compliant 
#head(beerDat)
########################################################
# Question 1 - How many breweries are in each state?
#
# During this analysis, we explored how many breweries are in each state and grouped the states 
# by US Census Divisions. The data is visually displayed using maps of each USC Division below 
# and summarized in a simple chart at the end.
#
######################
#                    #
#     Question 1     #
#                    #
######################
#
#########################################################
#Use Dplyr to group breweries by state
brewByState <- breweryDat %>% 
  group_by(State) %>% 
  dplyr::count()
#########################################################
#########################################################
#Add breweries by state to state information dataframe
statepop$brewByState <- brewByState$n
#########################################################
#########################################################
#Fix mismatched state brewery count to state info df
statepop[1,5] <- 3
statepop[2,5] <- 7
statepop[3,5] <- 11
statepop[4,5] <- 2
statepop[8,5] <- 2
statepop[9,5] <- 1
statepop[14,5] <- 18
statepop[15,5] <- 22
statepop[16,5] <- 5
statepop[20,5] <- 9
statepop[22,5] <- 23
statepop[25,5] <- 2
statepop[26,5] <- 9
statepop[28,5] <- 5
statepop[29,5] <- 2
statepop[30,5] <- 3
statepop[32,5] <- 4
statepop[34,5] <- 19
statepop[33,5] <- 16
statepop[35,5] <- 1
statepop[45,5] <- 4
statepop[46,5] <- 10
statepop[47,5] <- 16
statepop[49,5] <- 1
statepop[50,5] <- 20
#Check data 
#View(statepop)
#View(brewByState)
#########################################################
#########################################################
#Call plot functions to plot state brewery count on USmap
nationBrewPlot <- plot_usmap(data = statepop, values = "brewByState",labels=TRUE, color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma) + theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
#display plot
nationBrewPlot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, NE first
NEplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .new_england,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
NEplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, Mid Atlantic second
MAplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .mid_atlantic,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
MAplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, East North Central third
ENCplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .east_north_central,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
ENCplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, West North Central fourth
WNCplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .west_north_central,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
WNCplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, South Atlantic fifth
SAplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .south_atlantic,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
SAplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, East South Central sixth
ESCplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .east_south_central,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
ESCplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, West South Central seventh
WSCplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .west_south_central,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
WSCplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, Mountain eighth 
Mplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .mountain,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
Mplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, Pacific ninth 
Pplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .pacific,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
Pplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#### Bar Plot ####
#Plot overall breweries by state in bar chart
brewByState %>% ggplot(aes(y=reorder(State, n), x= n, fill = "#C8102E")) +
  geom_bar(stat = "identity", show.legend = FALSE, position = 'dodge') +
  geom_text(aes(label = brewByState$n), position=position_dodge(width=0.9), hjust = -0.25, vjust= .2, size = 3) +
  theme_classic() + 
  labs(title = "Breweries by State in the USA", 
       subtitle = "Budweiser Consultation",
       x = "Number of Breweries", y = "State")

######################
#                    #
#     Question 2     #
#                    #
######################
#############################################################################################################
# Question 2 - Merge the individual data sets
#
# We merged the breweries.csv dataset with the beers.csv dataset, additionally when we imported
# the individual datasets, we also imported a dataset that allows us to associate each beer with 
# its brewery's US Census Division.  
#
#############################################################################################################
#Use Dplyr package to merge the two tables together
buzzbrews <- merge(breweryDat, beerDat, by.x = "Brew_ID", by.y = "Brewery_id", all = TRUE )
#Use Dplyr package to rename "Name.x" to "Brewery" and "Name.y" to "Beer"
buzzbrews <- dplyr::rename(buzzbrews, "Brewery" = "Name.x","Beer"="Name.y")
bzbwTestDf <- buzzbrews
#Check the results
#View(buzzbrews)
######################
#                    #
#     Question 3     #
#                    #
######################
###########################################
# Question 3 - Address the missing values in each column.
#
# During the initial exploratory process we discovered NA's in both the IBU and ABV columns. 
# Upon further investigation we determined that some styles of beer, mixed or barrel aged beers
# do not have an ABV available at the time the brewery submits packaging labels to TTB, or Alcohol
# and Tobacco Tax and Trade Bureau. The TTB is the federal agency that determines what can and cannot
# be put on a beer label including the art, type size, verbiage, where elements are placed and etc.
# So beers without an AVB available either do not inlcude it, or add it to the bottom of the cans 
# or packaging at a later date. 
#
# In terms of the missing IBU values, we determined that even though the IBU alludes to the bitterness
# of a beer's taste, it is somewhat misleading because it is derived from a test that measures different
# chemical compounds that are known to cause bitter flavoring. For instance, a beer may have a high IBU 
# value, but due to other ingredients, such as added lactose or sucrose may actually have a sweeter taste 
# than would be expected from a high IBU. The other comfounding variable is if the brewery can afford the 
# equipment used to generate an IBU value, smaller breweries simply cannot afford it while the larger 
# breweries typically just use IBU as a quality control measure.
#
# Finally, we concluded that imputing data or filling in the missing gaps was a good idea for this
# analysis and that was done by taking an average of from similiar styles of beer and assigning that to
# beers in the same sytle classification that did not have values. Upon random testing of different imputed
# values, by googling beers that had missing values in the dataset and comparing that to the created averages,
# it was determined that the imputed values were very close to the actual values in the marketplace.
#
###########################################
#Loop to fix numbering for Column 1 "brew ID"
iterations <- length(buzzbrews$Brew_ID)
for (i in 1:iterations) {
  buzzbrews[i,1]=i
}
#Fix no style beers to none
levels(buzzbrews$Style) <- c(levels(buzzbrews$Style), "none")
for (i in 1:iterations) {
  if(is.na(buzzbrews[i,9])){
  }
}
for (i in 1:iterations) {
  if((buzzbrews[i,9])==''){
    #print(buzzbrews[i,9])
    buzzbrews[i,9]="none"
  }
}
#Prep new df to contain style and averages
buzzbrews$Style <- as.factor(buzzbrews$Style)
#Create a data frame with each style and a variable for average IBU
styleCount <- as.data.frame(levels(buzzbrews$Style))
styleCount$`levels(buzzbrews$Style)` <- as.character(styleCount$`levels(buzzbrews$Style)`)
#View(styleCount)
#Initialize mean ibu to zero (to avoid problems with N/As)
styleCount$meanIbu <- 0
#Make beer count to keep track of total in each style
styleCount$beerCount <- 0
#Make column for total ibus
styleCount$totalIBU <- 0
styleCount$meanABV <- 0
styleCount$ABVbeerCount <- 0
styleCount$totalABV <- 0
#Checking
#View(styleCount)
#styleCount <- styleCount[-c(1), ]
#View(styleCount)
#Calculate mean IBU for each category and store it in IBU df
#Calculate average IBU for each style and add it to df
#outer loop for all the beers
ibuSum <- 0
beerCount <- 0
i <- 1
for (i in 1:iterations) {
  if(is.na(buzzbrews[i,8])) {
    buzzbrews[i,8]=0
  }
  
  #inner for each style
  for (j in 1:100) {
      
    if(buzzbrews[i,9]==styleCount[j,1]){
     #Compute IBU sum
     styleCount[j,4] <- styleCount[j,4]+buzzbrews[i,8]
     
     
     
     #Total of each beer count
     styleCount[j,3] <- styleCount[j,3]+1
     
     if(buzzbrews[i,8]==0){
       styleCount[j,3] <- styleCount[j,3]-1
     }
    
     
     
    }
    #Mean IBU for each style
    styleCount[j,2] <- styleCount[j,4]/styleCount[j,3]
    }}
#Add average column from style count to buzzbrews df
for (i in 1:iterations) {
  if(buzzbrews[i,8]==0){
    for(j in 1:100){
      if(buzzbrews[i,9]==styleCount[j,1]){
        buzzbrews[i,8]=styleCount[j,2]
      }
    }
  }
}
# View(styleCount)
# View(buzzbrews)
# Now do it all again for ABV
# Calculate average ABV for each style and add it to df
# outer loop for all the beers
AlcSum <- 0
AlcVeerCount <- 0
i <- 1
for (i in 1:iterations) {
  if(is.na(buzzbrews[i,7])) {
   buzzbrews[i,7]=0
  }
  
  
  #inner for each style
  for (j in 1:100) {
   
    if(buzzbrews[i,9]==styleCount[j,1]){
     
     #Compute ALC sum
     styleCount[j,7] <- styleCount[j,7]+buzzbrews[i,7]*100
     
     
     
     #Total of each beer count
     styleCount[j,6] <- styleCount[j,6]+1
     
     if(buzzbrews[i,7]==0){
       styleCount[j,6] <- styleCount[j,6]-1
     }
    
     
     
    }
    #Mean ABV for each style
    styleCount[j,5] <- (styleCount[j,7]/styleCount[j,6])/100
    }
}
#Add average column from style count to buzzbrews df
for (i in 1:iterations) {
  if(buzzbrews[i,7]==0){
    for(j in 1:100){
      if(buzzbrews[i,9]==styleCount[j,1]){
        buzzbrews[i,7]=styleCount[j,5]
        
      }
      }
  }
}
#kill NaN's for other alcohol types with no hops
i <- 1
for(i in 1:iterations){
  if(is.na(buzzbrews[i,8])){
    buzzbrews[i,8] <- 0
  }
}
#Check out end results
buzzbrews <- merge(buzzbrews, regionData, by = "State")
View(buzzbrews)
######################
#                    #
#     Question 4     #
#                    #
######################
#################################################################################################################
# Question 4 - Compute the median alcohol content and international bitterness unit for 
# each state. Plot a bar chart to compare.
#
# We computed the MedStateABV and IBU for each state and created a visualisation that allowed
# us to further explore what those medians tell us. We found there appears to be a relationship
# between IBU and ABV where we can use IBU to estimate ABV of a given beer. 
#
# We explored this further by developing a model to make predictions based on historical IBU
# and ABV data and were able to predict that a beer with 32 IBU could have an ABV of 5.72% and
# we were 97.5% confident that beer would at least fall between 3.24% and 8.21%.
#
#################################################################################################################
buzzbrews$State <- trimws(buzzbrews$State)
# Group by state and compute 
combineddf <- buzzbrews %>%
  group_by(State) %>%
  dplyr::summarise(MedStateIBU = median(IBU), MedStateABV = median(ABV))
## `summarise()` ungrouping output (override with `.groups` argument)
combineddf <- as.data.frame(combineddf)
combineddf$MedStateIBU <- as.numeric(combineddf$MedStateIBU)
combineddf$MedStateABV <- as.numeric(combineddf$MedStateABV)
# Divisional measurements
divisiondf <- buzzbrews %>% 
  group_by(Division) %>% 
  dplyr::summarise(MedDivIBU = median(IBU), MedDivABV = median(ABV))
## `summarise()` ungrouping output (override with `.groups` argument)
# round values to xx.x ###
divisiondf$MedDivIBU <- round(divisiondf$MedDivIBU, digits = 1)
divisiondf$MedDivABV <- round(divisiondf$MedDivABV, digits = 1) 
combineddf$MedStateIBU <- round(combineddf$MedStateIBU, digits = 1)
combineddf$MedStateABV <- round(combineddf$MedStateABV, digits = 1)
# Add regions to combinddf
combineddf <- merge(combineddf,regionData,by="State")
# Add in divisional values
combineddf <- merge(combineddf, divisiondf, by = "Division")
####### Create chart labels for stacked charts #####
combineddf$ABVlabel <- paste(combineddf$State, combineddf$MedStateABV)
combineddf$IBUlabel <- paste(combineddf$State, combineddf$MedStateIBU)
view(combineddf)
# Create sums of medians for labeling charts #
StateSums <- combineddf %>%
  group_by(Division) %>%
  dplyr::summarise(SumStateABV = sum(MedStateABV), SumStateIBU = sum(MedStateIBU))
## `summarise()` ungrouping output (override with `.groups` argument)
combineddf <- merge(combineddf, StateSums, by = "Division")
#
############################################################
#########                                   ################
######### Draw Bar Chart = Median State ABV ################
#########                                   ################
############################################################
#
combineddf %>%
  ggplot(aes(x=Division, y=MedStateABV,fill= reorder(State,-MedStateABV))) +   
  # Create stacked by chart organized by Division with States stacked in each bar
  geom_bar(aes(color = "#c8102e"),stat="identity", width= 0.7, position = position_stack(), show.legend = FALSE) + 
  # Add state and ABV value to each state's chart position
  geom_text(aes(label = ABVlabel), size = 3, position = position_stack(vjust = 0.5)) + 
  # Add Division ABV Values to top of each chart stack
  geom_text(aes(Division, MedDivABV + SumStateABV -3, label = MedDivABV), size = 3, vjust = 1, fontface = "italic") +
  # Label the chart objects
  labs(title="Median ABV by State by US Census Division in the USA", 
       subtitle="Budweiser Consultation", 
       caption="source: ABV. ABV imputed where necessary.",
       y = "Alcohol By Volume",
       x = "States by US Census Divisions ") + 
  theme_classic() +
  # Adjust the X-axis labels, remove y-labels since this is a stacked chart
  theme(axis.text.x = element_text(angle=90, vjust = 0.5,hjust = 1), 
        axis.text.y = element_blank(), axis.ticks = element_blank())

#
##### Create bar plot for IBU #####
#
combineddf %>%
  ggplot(aes(x=Division, y=MedStateIBU,fill= reorder(State,-MedStateIBU))) +   
  # Create stacked by chart organized by Division with States stacked in each bar
  geom_bar(aes(color = "#c8102e"),stat="identity", width= 0.7, position = position_stack(), show.legend = FALSE) + 
  # Add state and IBU value to each state's chart position
  geom_text(aes(label = IBUlabel), size = 3, position = position_stack(vjust = 0.5)) + 
  # Add Division IBU Values to top of each chart stack
  geom_text(aes(Division, MedDivIBU + SumStateIBU - 15, label = MedDivIBU), size = 3, vjust = 1, fontface = "italic") +
  # Label the chart objects
  labs(title="Median IBU by State by US Census Division in the USA", 
       subtitle="Budweiser Consultation", 
       caption="source: IBU. IBU imputed where necessary.",
       y = "Median Int'l Bitterness Unit",
       x = "States by US Census Divisions ") + 
  theme_classic() +
  # Adjust the X-axis labels, remove y-labels since this is a stacked chart
  theme(axis.text.x = element_text(angle=90, vjust = 0.5,hjust = 1), 
        axis.text.y = element_blank(), axis.ticks = element_blank())

#
#
####################################################
##                                                ##
## Scatterplot MedStateIBU vs MedStateABV by State  ##
##                                                ##
####################################################
## Calculate slope and intercept of line of best fit ##
abline_values <- coef(lm(MedStateABV ~ MedStateIBU, combineddf))
#  (Intercept) MedStateABV 
#   14.5377926    0.4013998 
ggplot(combineddf,aes(x = MedStateIBU, y = MedStateABV, color = State)) +
  geom_point(show.legend = FALSE) +
  # Add ABLine to the chart to see if there is a linear relationship
  geom_abline(intercept = abline_values[1] , slope = abline_values[2] , color = "#c8102E", size = 1) +   
  # Add state labels, but only for outliers
  geom_text(data = subset(combineddf,MedStateIBU > 45 | MedStateABV < 4.5, 
                          select = c(State, MedStateIBU, MedStateABV)), 
            aes(label = State), vjust= -0.6, size = 3, na.rm = TRUE, 
            show.legend = FALSE, color = "#000000") +
  theme_classic() + 
  labs(title = "Median State ABV vs Median State IBU", 
       subtitle = "Budweiser Consultation",
       y = "Median Alcoholic By Vol", 
       x = "Median Int'l Bitterness Unit",
       caption = "NOTE: Missing ABV and IBU values imputed")

######################
#                    #
#     Question 5     #
#                    #
######################
#############################################################################################################
# Question 5 - Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?
#
# We determined that the maximum observed IBU was 138 in Oregon for Bitter Bitch Imperial IPA that 
# is an American Double/ Imperial IPA from the Astoria Brewing Company in Austoria, OR. 
# 
# We also determined that maximum observed ABV was 12.8% in Colorado for Lee Hill Series Vol. 5 – 
# Belgian Style Quadrupel Ale from Upslope Brewing Company in Boulder, CO.
#
#############################################################################################################
#Figure out which has highest ABV
MaxStateABV <- arrange(buzzbrews, desc(ABV))
print(MaxStateABV[1,4])
## [1] "Boulder"
#Figure out which has highest IBU
maxIBU <- arrange(buzzbrews,desc(IBU))
print(maxIBU[1,4])
## [1] "Astoria"
##### Question 5 Answer #####
## Colorado has the highest ABV = 12.8, Oregon has the highest IBU = 138.
###################################################
###### Create DF for just the max ABV & IBU values ######
# State measurements
maxStateValues <- buzzbrews %>% 
  group_by(State) %>% 
  dplyr::count(MaxStateABV = max(ABV), MaxStateIBU = max(IBU))
maxStateValues <- maxStateValues[,-4]
maxStateValues <- as.data.frame(maxStateValues)
maxStateValues$State <- trimws(maxStateValues$State)
str(maxStateValues)
## 'data.frame':    51 obs. of  3 variables:
##  $ State      : chr  "AK" "AL" "AR" "AZ" ...
##  $ MaxStateABV: num  6.8 9.3 6.1 9.5 9.9 ...
##  $ MaxStateIBU: num  71 103 45.7 99 115 ...
view(maxStateValues)
# Divisional measurements
divMaxValdf <- buzzbrews %>% 
  group_by(Division) %>% 
  dplyr::count(MaxDivABV = max(ABV), MaxDivIBU = max(IBU))
divMaxValdf <- divMaxValdf[,-4]
divMaxValdf <- as.data.frame(divMaxValdf)
# round values to xx.x ###
maxStateValues$MaxStateABV <- round(maxStateValues$MaxStateABV, digits = 1)
maxStateValues$MaxStateIBU <- round(maxStateValues$MaxStateIBU, digits = 1)
# Add regions to maxStateValues
maxStateValues <- merge(maxStateValues,regionData,by="State")
# Add in divisional values
maxStateValues <- merge(maxStateValues, divMaxValdf, by = "Division")
####### Create chart labels for stacked charts #####
maxStateValues$ABVmaxLabel <- paste(maxStateValues$State, maxStateValues$MaxStateABV)
maxStateValues$IBUmaxLabel <- paste(maxStateValues$State, maxStateValues$MaxStateIBU)
view(maxStateValues)
# Create sums of max values for labeling charts #
StateMaxSums <- maxStateValues %>%
  group_by(Division) %>%
  dplyr::summarise(SumStateABV = sum(MaxStateABV), SumStateIBU = sum(MaxStateIBU))
## `summarise()` ungrouping output (override with `.groups` argument)
maxStateValues <- merge(maxStateValues, StateMaxSums, by = "Division")
###################################################
###### Plot for Max ABV ###########################
maxStateValues %>%
  ggplot(aes(x=Division, y=MaxStateABV,fill= reorder(State,-MaxStateABV))) +   
  # Create stacked by chart organized by Division with States stacked in each bar
  geom_bar(aes(color = "#c8102e"),stat="identity", width= 0.7, position = position_stack(), show.legend = FALSE) + 
  # Add state and ABV value to each state's chart position
  geom_text(aes(label = ABVmaxLabel), size = 3, position = position_stack(vjust = 0.5)) + 
  # Add Division ABV Values to top of each chart stack
  geom_text(aes(Division, MaxDivABV + SumStateABV, label = MaxDivABV), size = 3, nudge_y = -7, fontface = "italic") +
  # Label the chart objects
  labs(title="Max ABV by State by US Census Division in the USA", 
       subtitle="Budweiser Consultation", 
       caption="source: ABV. ABV imputed where necessary.",
       y = "Alcohol By Volume",
       x = "States by US Census Divisions ") + 
  theme_classic() +
  # Adjust the X-axis labels, remove y-labels since this is a stacked chart
  theme(axis.text.x = element_text(angle=90, vjust = 0.5,hjust = 1), 
        axis.text.y = element_blank(), axis.ticks = element_blank())

###################################################
############### Chart Max IBU #####################
maxStateValues %>%
  ggplot(aes(x=Division, y=MaxStateIBU,fill= reorder(State,-MaxStateIBU))) +   
  # Create stacked by chart organized by Division with States stacked in each bar
  geom_bar(aes(color = "#c8102e"),stat="identity", width= 0.7, position = position_stack(), show.legend = FALSE) + 
  # Add state and ABV value to each state's chart position
  geom_text(aes(label = IBUmaxLabel), size = 3, position = position_stack(vjust = 0.5)) + 
  # Add Division ABV Values to top of each chart stack
  geom_text(aes(Division, MaxDivIBU + SumStateIBU, label = MaxDivIBU), size = 3, nudge_y = -75, fontface = "italic") +
  # Label the chart objects
  labs(title="Max IBU by State by US Census Division in the USA", 
       subtitle="Budweiser Consultation", 
       caption="source: IBU imputed where necessary.",
       y = "Alcohol By Volume",
       x = "States by US Census Divisions ") + 
  theme_classic() +
  # Adjust the X-axis labels, remove y-labels since this is a stacked chart
  theme(axis.text.x = element_text(angle=90, vjust = 0.5,hjust = 1), 
        axis.text.y = element_blank(), axis.ticks = element_blank())

######################
#                    #
#     Question 6     #
#                    #
######################
#############################################################################################################
# Question 6 - Comment on the summary statistics and distribution of the ABV variable.
#
# We observed summary statistics from the ABV data showing that once we filled in the missing 
# values as best as we could, there was a range of 0.10% to 12.80% with a median of 5.65% (median
# is simply the middle value if we were to arrange all the ABVs in either decending or ascending order).
#
# We also found a very common range within the overall range that went from 5.0% ABV to 6.70% ABV and 
# upon further review noticed this is where many commonly mass produced beers fall, for example: Bud 
# Ice (5.5%), Bud Light Platinum (6%), Natural Ice (5.9%), Bud Ice (5.5%), Budweiser (5%), Blue Moon 
# (5%), Stella Artois (5%), Heinekin (5%), Pabst Blue Ribbon (4.74%) and Miller Genuinine Draft (4.6%).
#
#############################################################################################################
# Check on the distribution of ABV
hist(buzzbrews$ABV, col = "#c8102e",
     main = "Histogram of ABV Distribution",
     sub = "Busweiser Consultation",
     xlab = "ABV Ratings")

densityABV <- density(buzzbrews$ABV)
plot(densityABV, 
     main = "Kernel Density of Alcohol By Volume", 
     sub = "Budweiser Consultation",
     xlab = "ABV Ratings")
polygon(densityABV, col = "#c8102e")

ABVsummary <- summary(buzzbrews$ABV)
ABVsummary
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.100   5.000   5.650   5.975   6.700  12.800
############### Not sure what we want to do down here ##################
#ABVSumNames <- names(ABVsummary)
#ABVSumNames <- as.factor(ABVSumNames)
#ABVsummary <- as.list(unname(ABVsummary))
#ABVsum <- data.frame(ABVsummary)
#ABVsum <- (ABVSumNames)
#as.data.frame(ABVsum)
#ABVsum <- rename(ABVsum, "Summary"="", "Value"="ABVsummary")
########################################################################
######################
#                    #
#     Question 7     #
#                    #
######################
#############################################################################################################
# Question 7 - Is there an apparent relationship between the bitterness of the beer and its 
# alcoholic content? Draw a scatter plot.  Make your best judgment of a relationship and 
# EXPLAIN your answer.
#
# We used a scatter plot to viusally explore if there is any sort of relationship between 
# IBU and ABV, in other words can IBU determine ABV or can ABV be used to determine IBU. 
# There was evidence of a positive relationship, but and we will discuss this further shortly, 
# it appears one can potentially predict the other.
#
#############################################################################################################
## Calculate slope and intercept of line of best fit ##
comparisonCoef <- coef(lm(ABV ~ IBU, buzzbrews))
comparisonCoef
## (Intercept)         IBU 
##  4.71799073  0.03142639
#  (Intercept)    MaxIBU 
#   4.71799073  0.03142639 
buzzbrews %>% 
  ggplot(aes(x = IBU, y = ABV, color = "#c8102e")) +
  geom_point(show.legend = FALSE, na.rm = TRUE) +
  geom_abline(intercept =  comparisonCoef[1] , slope = comparisonCoef[2], color = "#c8102E", size = 1) +
  theme_classic() + 
  labs(title = "IBU vs ABV", 
       subtitle = "Budweiser Consultation",
       y = "Alcoholic By Volume", 
       x = "Int'l Bitterness Unit",
       caption="ABV and IBU values imputed where necessary.")

######################
#                    #
#     Question 8     #
#                    #
######################
#############################################################################################################
# Question 8 - .  Budweiser would also like to investigate the difference with respect to IBU and ABV 
# between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA).
# You decide to use KNN classification to investigate this relationship.  Provide statistical evidence one
# way or the other. You can of course assume your audience is comfortable with percentages … KNN is very easy 
# to understand conceptually.
# In addition, while you have decided to use KNN to investigate this relationship (KNN is required) you may
# also feel free to supplement your response to this question with any other methods or techniques you have
# learned.  Creativity and alternative solutions are always encouraged.  
#
# Response: 
# We built a kNN (nearest neighbor) classifier to see if there is a difference between IPA and Ale, and while
# we were at it, we also added in a third class called, "neither." In building the kNN, we wanted to explore 
# what the appropriate number of "neighbors" was to compare to since there is so many observations so close
# together (think New York city and all the noise generated). We found that generally 8 neighbors were the 
# best estimation (we randomly parsed the data 100 times to find the best neighbors value). 
#
# Our classifier was accurate in determining if a beer was an Ale, IPA or neither about 64.5% of the time
# when we used 8 nearest neighbors.
# Next we created some random pairings of IBU and ABV to see how the classifier handled the data and discovered 
# it again was about 64.5% accurate. It is far more accurate identifyly neither style of beer 78% of the time, 
# then IPAs 67.5% of the time and Ale's 26% of the time.
# We also look a look at the ranges for IBU and ABV for each of the 3 broad types of beers IPA, Ale or 
# "neither" and found the following results, showing that it should be more difficult to predict between 
# the 3 different types of beers.
# 
# IPAAle  ABV.min ABV.med ABV.max IBU.min IBU.med IBU.max
# Ale         3.5     5.4    12.8       7    31       120
# IPA         4       6.7     9.9      19    67.6     138
# neither     0.1     5.5    12.5       0    28       130
#
# Additionally we re-visualized the plot chart with a regression line from the previous question, this 
# time showing the plots colored based on the classification of Ale, IPA or neither.
#
#############################################################################################################
#Label Ales, IPAs and neither
buzzbrews$IPAAle = case_when(grepl("\\bIPA\\b", buzzbrews$Beer, ignore.case = TRUE) ~ "IPA",
                             grepl("\\bindia pale ale\\b", buzzbrews$Beer, ignore.case = TRUE) ~ "IPA",
                             grepl("\\bale\\b", buzzbrews$Beer, ignore.case = TRUE ) ~ "Ale",
                             TRUE ~ "neither")
view(buzzbrews)
##### Find the best value of K and train the model ##############
iterations = 100
numks = 25
splitPerc = .70
set.seed(33)
masterAcc = matrix(nrow = iterations, ncol = numks)
  
for(j in 1:iterations)
{
accs = data.frame(accuracy = numeric(30), k = numeric(30))
trainIndices = sample(1:dim(buzzbrews)[1],round(splitPerc * dim(buzzbrews)[1]))
train = buzzbrews[trainIndices,]
test = buzzbrews[-trainIndices,]
for(i in 1:numks)
  {
  classifications = knn(train[,c(7,8)],test[,c(7,8)],train$IPAAle, prob = TRUE, k = i)
  table(classifications,test$IPAAle)
  CM = confusionMatrix(table(classifications,test$IPAAle))
  masterAcc[j,i] = CM$overall[1]
  }
}
MeanAcc = colMeans(masterAcc)
# Visually find the best value of k by using it's location in the dataframe based on the highest Mean value
plot(seq(1,numks,1),MeanAcc, type = "l", 
     col = "#c8201e",
     main = "Value for K Neighbors vs Accuracy", 
     sub = "Budweiser Consultation",
     xlab = "Value of K Neighbors",
     ylab = "Accuracy Rate (Percentage)")

# Locate the value of k based on the best MeanAcc in the dataframe
kvalue = match(max(MeanAcc), MeanAcc)
max(MeanAcc)
## [1] 0.6544813
kvalue
## [1] 9
####### Best value of k = 8 between 59% - 67% Accuracy #####################
####### Train the model using k = 8 #####################
classifications = knn(train[,c(7,8)],test[,c(7,8)],train$IPAAle, prob = TRUE, k = kvalue, use.all = TRUE)
  table(classifications,test$IPAAle)
##                
## classifications Ale IPA neither
##         Ale      67  12      49
##         IPA       9  64      55
##         neither 112  30     325
  CM = confusionMatrix(table(classifications,test$IPAAle))
CM
## Confusion Matrix and Statistics
## 
##                
## classifications Ale IPA neither
##         Ale      67  12      49
##         IPA       9  64      55
##         neither 112  30     325
## 
## Overall Statistics
##                                          
##                Accuracy : 0.6307         
##                  95% CI : (0.5944, 0.666)
##     No Information Rate : 0.5934         
##     P-Value [Acc > NIR] : 0.02199        
##                                          
##                   Kappa : 0.3221         
##                                          
##  Mcnemar's Test P-Value : 4.24e-07       
## 
## Statistics by Class:
## 
##                      Class: Ale Class: IPA Class: neither
## Sensitivity             0.35638    0.60377         0.7576
## Specificity             0.88598    0.89627         0.5170
## Pos Pred Value          0.52344    0.50000         0.6959
## Neg Pred Value          0.79664    0.92941         0.5938
## Prevalence              0.26003    0.14661         0.5934
## Detection Rate          0.09267    0.08852         0.4495
## Detection Prevalence    0.17704    0.17704         0.6459
## Balanced Accuracy       0.62118    0.75002         0.6373
####### Test the Classifier with some random data ###
classifyMyBeers <- data.frame(ABV = c(6,6,5,4,5, 12, 7), 
       IBU = c(78, 65, 55, 38, 100, 148, 98))
classifications = knn(train[,c(7,8)],classifyMyBeers,train$IPAAle, prob = TRUE, k = kvalue)
classifications
## [1] neither Ale     Ale     neither IPA     neither IPA    
## attr(,"prob")
## [1] 0.6666667 0.6666667 0.5000000 0.4444444 0.7777778 0.6666667 0.8181818
## Levels: Ale IPA neither
############ Test Results #######################
#Class: neither   Ale       Ale       neither   IPA       neither   IPA    
#Prob:  0.6250000 0.6250000 0.6250000 0.7500000 0.7500000 0.5000000 0.7777778
##################################################
############# Summary data by classification #############
IPAAleSummary <- buzzbrews %>% 
  group_by(IPAAle) %>% 
  dplyr::summarise(ABV.min = min(ABV), 
                   ABV.med = median(ABV),
                   ABV.max = max(ABV), 
                   IBU.min = min(IBU), 
                   IBU.med = median(IBU),
                   IBU.max = max(IBU))
## `summarise()` ungrouping output (override with `.groups` argument)
IPAAleSummary
########################################################
##### Replot and color by beer style ###################
comparisonCoef <- coef(lm(ABV ~ IBU, buzzbrews))
comparisonCoef
## (Intercept)         IBU 
##  4.71799073  0.03142639
#  (Intercept)    MaxIBU 
#   4.71799073  0.03142639 
buzzbrews %>% 
  ggplot(aes(x = IBU, y = ABV, color = IPAAle)) +
  geom_point(show.legend = TRUE, na.rm = TRUE) +
  geom_abline(intercept =  comparisonCoef[1] , slope = comparisonCoef[2], color = "#c8102E", size = 1) +
  theme_classic() + 
  labs(title = "IBU vs ABV", 
       subtitle = "Budweiser Consultation",
       y = "Alcoholic By Volume", 
       x = "Int'l Bitterness Unit",
       caption="ABV and IBU values imputed where necessary.")

#############################################################################################################################################
#Alternative technique for classifcation
#Hypothesis: Naive Bayes is a stronger ML technique for categorical data
#We implemented a Naive Bayes classifer based on IBU and ABV as a predictor of categotization of style of beer (IPA,Ale, or Neither)
#############################################################################################################################################

#############################################################################################################################################
#Create new DF for Naive Bayes classifer (Don't want to interfere with original Buzzbrews DF)
bayesDat <- buzzbrews
#Make the classifer happy and convert outcome to factor
bayesDat$IPAAle <- as.factor(bayesDat$IPAAle)
#Run this loop to run classifier 100 times to determine mean accruary
iterations = 100
masterAcc = matrix(nrow = iterations,ncol=3)
#Begin the loop
for(j in 1:iterations)
{
#change seed each iteration
set.seed(j)

#Determine training and testing indicices   
trainIndices = sample(seq(1:length(bayesDat$Beer)),round(.8*length(bayesDat$Beer)))
trainBeer = bayesDat[trainIndices,]
testBeer = bayesDat[-trainIndices,]

#Generate model, table, and confusion matrix
model = naiveBayes(trainBeer[,c(7,8)],trainBeer$IPAAle)
table(predict(model,testBeer[,c(7,8)]),testBeer$IPAAle)
CM = confusionMatrix(table(predict(model,testBeer[,c(7,8)]),testBeer$IPAAle))

#Insert current accuracies
masterAcc[j,1] = CM$overall[1]
masterAcc[j,2] = CM$byClass[1]
masterAcc[j,3] = CM$byClass[2]
}
#Mean accuracy
MeanAcc = colMeans(masterAcc)
MeanAcc
## [1] 0.6311411 0.0000000 0.6802324
#Confusion matrix
CM
## Confusion Matrix and Statistics
## 
##          
##           Ale IPA neither
##   Ale       0   0       0
##   IPA       5  60      47
##   neither 107  21     242
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6266          
##                  95% CI : (0.5817, 0.6699)
##     No Information Rate : 0.5996          
##     P-Value [Acc > NIR] : 0.1224          
##                                           
##                   Kappa : 0.2541          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: Ale Class: IPA Class: neither
## Sensitivity              0.0000     0.7407         0.8374
## Specificity              1.0000     0.8703         0.3368
## Pos Pred Value              NaN     0.5357         0.6541
## Neg Pred Value           0.7676     0.9432         0.5804
## Prevalence               0.2324     0.1680         0.5996
## Detection Rate           0.0000     0.1245         0.5021
## Detection Prevalence     0.0000     0.2324         0.7676
## Balanced Accuracy        0.5000     0.8055         0.5871
#############################################################################################################################################

#############################################################################################################################################
#Naive Bayes Summary
#In summary the classifer correctly selects IPA about 61% of the time, all other ales around 63% of the time, and handles "other" beers about 84% of the time.
#These are consistent with what we can expect from a classifer, and this model in particular does especially well at identifying beers that aren't in the specified categories, this is unsurpirsing as many beers do not fall into this category.
#This classifer complements the KNN classifier as KNN is strong at identifying similar beers, and the Bayesian classifier better handles
#non-grouped beers
#############################################################################################################################################
######################
#                    #
#     Question 9     #
#                    #
######################
#############################################################################################################
#Knock their socks off!  Find one other useful inference from the data that you feel Budweiser may be able to find value in.  You must convince them why it is important and back up your conviction with appropriate statistical evidence. 
#############################################################################################################
#############################################################################################################
#Code summary
#This block examines some potential relationships in the data including relationships between IBU, ABV, style and 
#size of container beer is packaged in

#############################################################################################################
#Create new df for graph data
q9DF <- buzzbrews


#Create pairs plot for obvious evidence of relationships
scatterplotMatrix(~Style+IBU+ABV+Ounces, data=q9DF ,  col="#c8102e", cex=0.5 , 
      pch=c(15,16,17) , 
      main="Scatter plots for visual evidence of relationships"
      )

#Closer examination of relationship between style and ibu

#Plot relationship of style and IBU and different states
ggplot(q9DF, aes(x=Style, y=IBU, color=State)) + geom_point(size=2)   +
  theme_classic() + 
  labs(title = "IBU vs Style", 
       subtitle = "Budweiser Consultation",
       x = "Beer Style (100 differnt styles)",
       y = "Int'l Bitterness Unit (where available)",
       caption="ABV and IBU values imputed where necessary.")+theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 5))

#Do some anova

ibustyleLM <- lm(IBU~Style,data = q9DF)
summary(ibustyleLM)
## 
## Call:
## lm(formula = IBU ~ Style, data = q9DF)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.320  -2.593   0.000   0.407  78.701 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                               2.200e+01  7.182e+00   3.063 0.002215
## StyleAltbier                              1.212e+01  7.715e+00   1.572 0.116161
## StyleAmerican Adjunct Lager              -1.100e+01  7.570e+00  -1.453 0.146355
## StyleAmerican Amber / Red Ale             1.430e+01  7.236e+00   1.976 0.048261
## StyleAmerican Amber / Red Lager           1.250e+00  7.425e+00   0.168 0.866331
## StyleAmerican Barleywine                  7.400e+01  9.272e+00   7.981 2.26e-15
## StyleAmerican Black Ale                   4.690e+01  7.379e+00   6.356 2.49e-10
## StyleAmerican Blonde Ale                 -1.016e+00  7.248e+00  -0.140 0.888492
## StyleAmerican Brown Ale                   7.895e+00  7.284e+00   1.084 0.278536
## StyleAmerican Dark Wheat Ale              5.600e+00  8.144e+00   0.688 0.491737
## StyleAmerican Double / Imperial IPA       7.132e+01  7.250e+00   9.837  < 2e-16
## StyleAmerican Double / Imperial Pilsner   6.300e+01  1.016e+01   6.203 6.56e-10
## StyleAmerican Double / Imperial Stout     4.000e+01  7.940e+00   5.038 5.07e-07
## StyleAmerican India Pale Lager            4.133e+01  9.272e+00   4.458 8.67e-06
## StyleAmerican IPA                         4.563e+01  7.199e+00   6.339 2.77e-10
## StyleAmerican Malt Liquor                -2.200e+01  1.244e+01  -1.769 0.077100
## StyleAmerican Pale Ale (APA)              2.294e+01  7.211e+00   3.181 0.001486
## StyleAmerican Pale Lager                  4.750e+00  7.364e+00   0.645 0.518961
## StyleAmerican Pale Wheat Ale             -1.311e+00  7.256e+00  -0.181 0.856577
## StyleAmerican Pilsner                     1.286e+00  7.464e+00   0.172 0.863247
## StyleAmerican Porter                      9.923e+00  7.287e+00   1.362 0.173399
## StyleAmerican Stout                       1.931e+01  7.364e+00   2.623 0.008783
## StyleAmerican Strong Ale                  4.342e+01  7.678e+00   5.655 1.75e-08
## StyleAmerican White IPA                   2.683e+01  7.808e+00   3.437 0.000599
## StyleAmerican Wild Ale                    5.000e+00  8.293e+00   0.603 0.546623
## StyleBaltic Porter                        3.200e+01  8.293e+00   3.859 0.000117
## StyleBelgian Dark Ale                     7.667e+00  7.808e+00   0.982 0.326229
## StyleBelgian IPA                          3.500e+01  7.570e+00   4.623 3.99e-06
## StyleBelgian Pale Ale                     1.875e+00  7.475e+00   0.251 0.801969
## StyleBelgian Strong Dark Ale              5.000e+01  8.293e+00   6.029 1.91e-09
## StyleBelgian Strong Pale Ale              3.000e+00  8.144e+00   0.368 0.712618
## StyleBerliner Weissbier                  -1.420e+01  7.808e+00  -1.819 0.069081
## StyleBière de Garde                       1.000e+00  8.144e+00   0.123 0.902279
## StyleBock                                 3.000e+00  8.144e+00   0.368 0.712618
## StyleBraggot                             -2.200e+01  1.244e+01  -1.769 0.077100
## StyleCalifornia Common / Steam Beer       1.900e+01  8.293e+00   2.291 0.022048
## StyleChile Beer                           2.000e+00  9.272e+00   0.216 0.829236
## StyleCider                               -2.200e+01  7.374e+00  -2.984 0.002878
## StyleCream Ale                           -1.667e+00  7.425e+00  -0.224 0.822425
## StyleCzech Pilsener                       1.181e+01  7.434e+00   1.589 0.112203
## StyleDoppelbock                           2.500e+00  8.144e+00   0.307 0.758879
## StyleDortmunder / Export Lager            1.800e+00  8.293e+00   0.217 0.828189
## StyleDubbel                              -1.000e+00  8.498e+00  -0.118 0.906334
## StyleDunkelweizen                        -6.000e+00  8.796e+00  -0.682 0.495230
## StyleEnglish Barleywine                   4.467e+01  9.272e+00   4.817 1.55e-06
## StyleEnglish Bitter                       8.667e+00  9.272e+00   0.935 0.350027
## StyleEnglish Brown Ale                    1.200e+00  7.570e+00   0.159 0.874068
## StyleEnglish Dark Mild Ale               -5.484e-13  8.293e+00   0.000 1.000000
## StyleEnglish India Pale Ale (IPA)         3.271e+01  7.715e+00   4.241 2.32e-05
## StyleEnglish Pale Ale                     9.400e+00  7.757e+00   1.212 0.225734
## StyleEnglish Pale Mild Ale               -6.000e+00  9.272e+00  -0.647 0.517620
## StyleEnglish Stout                        4.400e+01  1.016e+01   4.332 1.54e-05
## StyleEnglish Strong Ale                   3.200e+01  8.796e+00   3.638 0.000281
## StyleEuro Dark Lager                      5.000e+00  8.498e+00   0.588 0.556330
## StyleEuro Pale Lager                     -3.332e-13  1.016e+01   0.000 1.000000
## StyleExtra Special / Strong Bitter (ESB)  2.371e+01  7.533e+00   3.148 0.001663
## StyleFlanders Oud Bruin                   1.000e+00  1.244e+01   0.080 0.935935
## StyleFlanders Red Ale                    -2.200e+01  1.244e+01  -1.769 0.077100
## StyleForeign / Export Stout               1.667e+01  8.293e+00   2.010 0.044577
## StyleFruit / Vegetable Beer              -7.800e+00  7.327e+00  -1.065 0.287193
## StyleGerman Pilsener                      1.218e+01  7.379e+00   1.650 0.099037
## StyleGose                                -1.257e+01  7.867e+00  -1.598 0.110201
## StyleGrisette                             3.000e+00  1.244e+01   0.241 0.809447
## StyleHefeweizen                          -4.407e+00  7.359e+00  -0.599 0.549307
## StyleHerbed / Spiced Beer                 2.000e-01  7.940e+00   0.025 0.979906
## StyleIrish Dry Stout                      1.667e+01  8.498e+00   1.961 0.049965
## StyleIrish Red Ale                        6.000e+00  7.757e+00   0.773 0.439333
## StyleKeller Bier / Zwickel Bier           2.333e+00  9.272e+00   0.252 0.801329
## StyleKölsch                              -2.222e-01  7.351e+00  -0.030 0.975886
## StyleKristalweizen                       -2.200e+01  1.244e+01  -1.769 0.077100
## StyleLight Lager                         -1.033e+01  7.757e+00  -1.332 0.182972
## StyleLow Alcohol Beer                    -2.200e+01  1.244e+01  -1.769 0.077100
## StyleMaibock / Helles Bock                5.500e+00  8.498e+00   0.647 0.517551
## StyleMärzen / Oktoberfest                 9.048e-01  7.418e+00   0.122 0.902928
## StyleMead                                -2.200e+01  8.498e+00  -2.589 0.009689
## StyleMilk / Sweet Stout                   3.167e+00  7.867e+00   0.403 0.687352
## StyleMunich Dunkel Lager                 -1.667e+00  8.796e+00  -0.189 0.849735
## StyleMunich Helles Lager                 -4.083e+00  7.533e+00  -0.542 0.587806
## Stylenone                                 2.000e+00  8.498e+00   0.235 0.813954
## StyleOatmeal Stout                        6.091e+00  7.570e+00   0.805 0.421155
## StyleOld Ale                              1.800e+01  1.016e+01   1.772 0.076492
## StyleOther                               -6.000e+00  1.244e+01  -0.482 0.629615
## StylePumpkin Ale                          2.818e+00  7.488e+00   0.376 0.706673
## StyleQuadrupel (Quad)                     2.000e+00  8.796e+00   0.227 0.820153
## StyleRadler                              -2.333e+00  9.272e+00  -0.252 0.801329
## StyleRauchbier                           -2.200e+01  1.016e+01  -2.166 0.030411
## StyleRoggenbier                          -2.000e+00  1.016e+01  -0.197 0.843914
## StyleRussian Imperial Stout               6.450e+01  7.808e+00   8.261 2.40e-16
## StyleRye Beer                             3.000e+01  7.570e+00   3.963 7.63e-05
## StyleSaison / Farmhouse Ale               6.391e+00  7.319e+00   0.873 0.382604
## StyleSchwarzbier                          9.000e+00  7.940e+00   1.134 0.257119
## StyleScotch Ale / Wee Heavy               2.231e+00  7.646e+00   0.292 0.770493
## StyleScottish Ale                         4.909e+00  7.551e+00   0.650 0.515649
## StyleShandy                              -2.200e+01  9.272e+00  -2.373 0.017737
## StyleSmoked Beer                          1.300e+01  1.244e+01   1.045 0.296107
## StyleTripel                               1.500e+00  7.808e+00   0.192 0.847665
## StyleVienna Lager                         2.357e+00  7.533e+00   0.313 0.754363
## StyleWheat Ale                            2.000e+00  1.244e+01   0.161 0.872282
## StyleWinter Warmer                        2.625e+00  7.646e+00   0.343 0.731384
## StyleWitbier                             -5.792e+00  7.321e+00  -0.791 0.428992
##                                             
## (Intercept)                              ** 
## StyleAltbier                                
## StyleAmerican Adjunct Lager                 
## StyleAmerican Amber / Red Ale            *  
## StyleAmerican Amber / Red Lager             
## StyleAmerican Barleywine                 ***
## StyleAmerican Black Ale                  ***
## StyleAmerican Blonde Ale                    
## StyleAmerican Brown Ale                     
## StyleAmerican Dark Wheat Ale                
## StyleAmerican Double / Imperial IPA      ***
## StyleAmerican Double / Imperial Pilsner  ***
## StyleAmerican Double / Imperial Stout    ***
## StyleAmerican India Pale Lager           ***
## StyleAmerican IPA                        ***
## StyleAmerican Malt Liquor                .  
## StyleAmerican Pale Ale (APA)             ** 
## StyleAmerican Pale Lager                    
## StyleAmerican Pale Wheat Ale                
## StyleAmerican Pilsner                       
## StyleAmerican Porter                        
## StyleAmerican Stout                      ** 
## StyleAmerican Strong Ale                 ***
## StyleAmerican White IPA                  ***
## StyleAmerican Wild Ale                      
## StyleBaltic Porter                       ***
## StyleBelgian Dark Ale                       
## StyleBelgian IPA                         ***
## StyleBelgian Pale Ale                       
## StyleBelgian Strong Dark Ale             ***
## StyleBelgian Strong Pale Ale                
## StyleBerliner Weissbier                  .  
## StyleBière de Garde                         
## StyleBock                                   
## StyleBraggot                             .  
## StyleCalifornia Common / Steam Beer      *  
## StyleChile Beer                             
## StyleCider                               ** 
## StyleCream Ale                              
## StyleCzech Pilsener                         
## StyleDoppelbock                             
## StyleDortmunder / Export Lager              
## StyleDubbel                                 
## StyleDunkelweizen                           
## StyleEnglish Barleywine                  ***
## StyleEnglish Bitter                         
## StyleEnglish Brown Ale                      
## StyleEnglish Dark Mild Ale                  
## StyleEnglish India Pale Ale (IPA)        ***
## StyleEnglish Pale Ale                       
## StyleEnglish Pale Mild Ale                  
## StyleEnglish Stout                       ***
## StyleEnglish Strong Ale                  ***
## StyleEuro Dark Lager                        
## StyleEuro Pale Lager                        
## StyleExtra Special / Strong Bitter (ESB) ** 
## StyleFlanders Oud Bruin                     
## StyleFlanders Red Ale                    .  
## StyleForeign / Export Stout              *  
## StyleFruit / Vegetable Beer                 
## StyleGerman Pilsener                     .  
## StyleGose                                   
## StyleGrisette                               
## StyleHefeweizen                             
## StyleHerbed / Spiced Beer                   
## StyleIrish Dry Stout                     *  
## StyleIrish Red Ale                          
## StyleKeller Bier / Zwickel Bier             
## StyleKölsch                                 
## StyleKristalweizen                       .  
## StyleLight Lager                            
## StyleLow Alcohol Beer                    .  
## StyleMaibock / Helles Bock                  
## StyleMärzen / Oktoberfest                   
## StyleMead                                ** 
## StyleMilk / Sweet Stout                     
## StyleMunich Dunkel Lager                    
## StyleMunich Helles Lager                    
## Stylenone                                   
## StyleOatmeal Stout                          
## StyleOld Ale                             .  
## StyleOther                                  
## StylePumpkin Ale                            
## StyleQuadrupel (Quad)                       
## StyleRadler                                 
## StyleRauchbier                           *  
## StyleRoggenbier                             
## StyleRussian Imperial Stout              ***
## StyleRye Beer                            ***
## StyleSaison / Farmhouse Ale                 
## StyleSchwarzbier                            
## StyleScotch Ale / Wee Heavy                 
## StyleScottish Ale                           
## StyleShandy                              *  
## StyleSmoked Beer                            
## StyleTripel                                 
## StyleVienna Lager                           
## StyleWheat Ale                              
## StyleWinter Warmer                          
## StyleWitbier                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.16 on 2310 degrees of freedom
## Multiple R-squared:  0.8327, Adjusted R-squared:  0.8255 
## F-statistic: 116.1 on 99 and 2310 DF,  p-value: < 2.2e-16
#Look at diagnotic plots
 # optional layout
layout(matrix(c(1,2,3,4),2,2))
# diagnostic plots
plot(ibustyleLM) 
## Warning: not plotting observations with leverage one:
##   140, 178, 184, 956, 1538, 1880, 2024, 2322
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

#############################################################################################################
#We are left with a linear model of the relationship between IBU and Style
#There is clear evidence of a relationship between certain beer types and IBU
#Model fits all assumptions

#############################################################################################################